Setup

First, let’s load the necessary libraries and data that will allow us to begin our investigation!

## Libraries to include
library(tidyverse)
library(lubridate)

## Load the data
# Replace the path below with the path to where your data lives
data_path <- "https://datajournalism.tech/wp-content/uploads/2019/10/wichita.csv"
stops <- read_csv(data_path)

# Additional data and fixed values we'll be using
population_2016 <- tibble(subject_race = c("asian/pacific islander", "black", "hispanic", "other/unknown","white"),
  num_people = c(19294, 42485, 65090, 16686, 245499)) %>% 
  mutate(subject_race = as.factor(subject_race))

center_lat <- 37.689820
center_lng <- -97.336454

Exploratory data analysis

colnames(stops)
##  [1] "X1"                      "raw_row_number"         
##  [3] "date"                    "time"                   
##  [5] "location"                "lat"                    
##  [7] "lng"                     "subject_age"            
##  [9] "subject_race"            "subject_sex"            
## [11] "type"                    "disposition"            
## [13] "violation"               "citation_issued"        
## [15] "outcome"                 "posted_speed"           
## [17] "vehicle_color"           "vehicle_make"           
## [19] "vehicle_model"           "vehicle_year"           
## [21] "raw_defendant_race"      "raw_defendant_ethnicity"
nrow(stops)
## [1] 57750
str(stops)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 57750 obs. of  22 variables:
##  $ X1                     : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ raw_row_number         : chr  "923578" "923657" "912091" "923680" ...
##  $ date                   : Date, format: "2016-01-01" "2016-01-01" ...
##  $ time                   : 'hms' num  18:00:00 18:08:00 18:11:00 18:13:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ location               : chr  "N WEST ST, KS, 67205" "8000 W 13TH ST N, WICHITA, KS, 67212" "500 S LIMUEL ST, WICHITA, KS, 67235" "7600 W 21ST ST N, WICHITA, KS, 67205" ...
##  $ lat                    : num  37.7 37.7 37.7 37.7 37.7 ...
##  $ lng                    : num  -97.4 -97.4 -97.5 -97.4 -97.4 ...
##  $ subject_age            : num  16 44 20 21 28 27 15 20 23 NA ...
##  $ subject_race           : chr  "white" "white" "white" "hispanic" ...
##  $ subject_sex            : chr  "female" "male" "male" "female" ...
##  $ type                   : chr  "vehicular" "vehicular" "vehicular" "vehicular" ...
##  $ disposition            : chr  "DISMISSED" "GUILTY (IVR)" "DISMISSED WITH PREJUDICE; DISMISSED WITH PREJUDICE" "GUILTY" ...
##  $ violation              : chr  "RUN STOP SIGN" "SPEED OVER LIMIT" "DUI; INATTENTIVE DRIVING" "SPEED OVER LIMIT" ...
##  $ citation_issued        : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ outcome                : chr  "citation" "citation" "citation" "citation" ...
##  $ posted_speed           : num  NA 40 NA 40 40 40 NA NA NA NA ...
##  $ vehicle_color          : chr  "BURGUNDY OR MAROON" "\"ALUMINUM, SILVER\"" "WHITE" "\"ALUMINUM, SILVER\"" ...
##  $ vehicle_make           : chr  "JEEP (1989 TO PRESENT)" "HYUNDAI" "HONDA" "TOYOTA" ...
##  $ vehicle_model          : chr  NA "TUCSON" NA NA ...
##  $ vehicle_year           : num  2008 NA NA NA NA ...
##  $ raw_defendant_race     : chr  "W" "W" "W" "W" ...
##  $ raw_defendant_ethnicity: chr  "N" "N" "N" "H" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   raw_row_number = col_character(),
##   ..   date = col_date(format = ""),
##   ..   time = col_time(format = ""),
##   ..   location = col_character(),
##   ..   lat = col_double(),
##   ..   lng = col_double(),
##   ..   subject_age = col_double(),
##   ..   subject_race = col_character(),
##   ..   subject_sex = col_character(),
##   ..   type = col_character(),
##   ..   disposition = col_character(),
##   ..   violation = col_character(),
##   ..   citation_issued = col_logical(),
##   ..   outcome = col_character(),
##   ..   posted_speed = col_double(),
##   ..   vehicle_color = col_character(),
##   ..   vehicle_make = col_character(),
##   ..   vehicle_model = col_character(),
##   ..   vehicle_year = col_double(),
##   ..   raw_defendant_race = col_character(),
##   ..   raw_defendant_ethnicity = col_character()
##   .. )
summary(stops)
##        X1        raw_row_number          date                time         
##  Min.   :    1   Length:57750       Min.   :2016-01-01   Length:57750     
##  1st Qu.:14438   Class :character   1st Qu.:2016-03-16   Class1:hms       
##  Median :28876   Mode  :character   Median :2016-05-29   Class2:difftime  
##  Mean   :28876                      Mean   :2016-06-10   Mode  :numeric   
##  3rd Qu.:43313                      3rd Qu.:2016-08-31                    
##  Max.   :57750                      Max.   :2016-12-31                    
##                                                                           
##    location              lat             lng           subject_age   
##  Length:57750       Min.   :37.47   Min.   :-101.36   Min.   :11.00  
##  Class :character   1st Qu.:37.67   1st Qu.: -97.37   1st Qu.:24.00  
##  Mode  :character   Median :37.69   Median : -97.34   Median :33.00  
##                     Mean   :37.69   Mean   : -97.33   Mean   :36.71  
##                     3rd Qu.:37.70   3rd Qu.: -97.28   3rd Qu.:48.00  
##                     Max.   :38.48   Max.   : -96.75   Max.   :99.00  
##                     NA's   :1167    NA's   :1167      NA's   :10128  
##  subject_race       subject_sex            type          
##  Length:57750       Length:57750       Length:57750      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  disposition         violation         citation_issued   outcome         
##  Length:57750       Length:57750       Mode:logical    Length:57750      
##  Class :character   Class :character   TRUE:57750      Class :character  
##  Mode  :character   Mode  :character                   Mode  :character  
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##   posted_speed    vehicle_color      vehicle_make       vehicle_model     
##  Min.   : 20.00   Length:57750       Length:57750       Length:57750      
##  1st Qu.: 30.00   Class :character   Class :character   Class :character  
##  Median : 40.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 39.93                                                           
##  3rd Qu.: 40.00                                                           
##  Max.   :304.00                                                           
##  NA's   :35149                                                            
##   vehicle_year   raw_defendant_race raw_defendant_ethnicity
##  Min.   :1962    Length:57750       Length:57750           
##  1st Qu.:2001    Class :character   Class :character       
##  Median :2005    Mode  :character   Mode  :character       
##  Mean   :2005                                              
##  3rd Qu.:2009                                              
##  Max.   :2999                                              
##  NA's   :43236
# This method uses the group_by/summarize paradigm
stops %>% 
  group_by(subject_race) %>% 
  summarize(
    n = n(),
    prop = n / nrow(.)
  )
## # A tibble: 5 x 3
##   subject_race               n   prop
##   <chr>                  <int>  <dbl>
## 1 asian/pacific islander  1607 0.0278
## 2 black                   8038 0.139 
## 3 hispanic                6709 0.116 
## 4 other/unknown           9335 0.162 
## 5 white                  32061 0.555

``

Benchmark test

We saw before that over two-thirds of stops were of black drivers. The by-race stop counts are only meaningful, though, when compared to some baseline. If the Philadelphia population was about two-thirds black, then two-thirds of stops being of black drivers wouldn’t be at all surprising.

Stop rates

In order to do this baseline comparison, we need to understand the racial demographics in our Philly population data. The data as we’ve given it to you has raw population numbers. To make it useful, we’ll need to compute the proportion of Philadelphia residents in each demographic group. (Hint: use the mutate() function.)

population_2016 %>% 
  mutate(prop = num_people / sum(num_people))
## # A tibble: 5 x 3
##   subject_race           num_people   prop
##   <fct>                       <dbl>  <dbl>
## 1 asian/pacific islander      19294 0.0496
## 2 black                       42485 0.109 
## 3 hispanic                    65090 0.167 
## 4 other/unknown               16686 0.0429
## 5 white                      245499 0.631
stop_final <- stops %>% 
  count(subject_race) %>% 
  left_join(
    population_2016,
    by = "subject_race"
  ) %>% 
  mutate(stop_rate = n / num_people)
## Warning: Column `subject_race` joining character vector and factor,
## coercing into character vector

Data visualizations

bar <- ggplot(stop_final,
       aes(x=reorder(subject_race,stop_rate), y=stop_rate))+
  geom_bar(stat="identity", 
           position="identity", 
           fill="red")+
  geom_hline(yintercept = 0) +
  labs(title="Stopped Drivers by Race",
       subtitle = "African American drivers were stopped more than White people in the city of Wichita,Kansas")+
  coord_flip()

options(scipen=10000)

bar

library(leaflet)
library(httpuv)

#Step 1. Create a color palette of your choice.
race <- colorFactor(c("beige", "black", "orange", "darkgreen", "blue"),
domain=c("white", "black", "asian/pacific islander", "hispanic", "other/unknown"),
ordered = TRUE)


#Step 3. Drop missing data values.
f <- stops %>% drop_na(lat, lng)

#Step 4. Map the data set.
leaflet(f) %>%
  addProviderTiles(providers$CartoDB) %>% 
  setView(lng= -97.336454, lat= 37.689820, zoom=16) %>% 
  addCircleMarkers(~lng,
                   ~lat,
                   popup=paste("This is a/an", f$subject_race, "and", f$subject_sex, "driver."),
                   weight= 3, 
                   radius=4, 
                   color=~race(subject_race),
                   stroke=F,
                   fillOpacity = 1)